# Kernel: sgemm_nn_32x128

# ******************************************************************************
# Copyright 2014-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************


# Named constants used by the code below.
#
#   szShareA  : 32*16 + 32 words, the size of one A tile in shared memory.
#   szShareB  : 128*16 words, the size of one B tile in shared memory.
#   addr_zero : 4 * (2*szShareB + 2*szShareA), i.e. the offset just past two
#               copies of each tile. The setup code stores RZ there and reads
#               it back wherever a block of zeros is needed.
#   gridDimA / gridDimB : constant-bank entries that are not referenced in the
#               code visible in this file (possibly used by the included file).
#   param_*   : kernel arguments, read from constant bank 0 at offsets
#               0x140 through 0x188. param_C, param_A and param_B are each
#               split into a low [0] and a high [1] 32-bit half.
<CONSTANT_MAPPING>
    addr_zero  : 4x<128*16*2 + (32*16 + 32)*2>
    szShareA : (32*16 + 32)
    szShareB : (128*16)

    gridDimA : c[0x0][0x14]
    gridDimB : c[0x0][0x18]

    param_C[0]      : c[0x0][0x140]
    param_C[1]      : c[0x0][0x144]
    param_A[0]      : c[0x0][0x148]
    param_A[1]      : c[0x0][0x14c]
    param_B[0]      : c[0x0][0x150]
    param_B[1]      : c[0x0][0x154]
    param_alpha     : c[0x0][0x158]
    param_beta      : c[0x0][0x15c]
    param_flags     : c[0x0][0x160]
    param_lda       : c[0x0][0x164]
    param_ldb8      : c[0x0][0x168]
    param_ldc       : c[0x0][0x16c]
    param_m         : c[0x0][0x170]
    param_n         : c[0x0][0x174]
    param_k         : c[0x0][0x178]
    param_ldaz      : c[0x0][0x17c]
    param_ldbz      : c[0x0][0x180]
    param_ldcz      : c[0x0][0x184]
    param_loops     : c[0x0][0x188]
</CONSTANT_MAPPING>

<REGISTER_MAPPING>

    // Names for the physical registers. Several ranges below overlap on purpose:
    // registers 32-79 are listed once for the setup temporaries, once for the
    // j0..j3 A/B operands and again for the C output names, so those groups
    // must not be live at the same time.
    // NOTE(review): ':' looks like a fixed, in-order assignment and '~' like an
    // assembler-chosen assignment within the range (maxas convention) - confirm.

    // Temporaries used while computing addresses and predicates.
    32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3>

    // Registers 0-31 under a flat name; the setup code fills them with zeros
    // through 128-bit loads.
    0-31 : czero<00-31>

    // The same registers 0-31 named as a 4 (x) by 8 (y) grid.
     3, 2,11,10 : cx<0-3>y0
     7, 6,15,14 : cx<0-3>y1
     1, 0, 9, 8 : cx<0-3>y2
     5, 4,13,12 : cx<0-3>y3
    19,18,27,26 : cx<0-3>y4
    23,22,31,30 : cx<0-3>y5
    17,16,25,24 : cx<0-3>y6
    21,20,29,28 : cx<0-3>y7

    // Four sets of operand registers: eight A values and four B values each.
      32-43 : j0Ay<0-7>, j0Bx<0-3>
      44-55 : j1Ay<0-7>, j1Bx<0-3>
      56-67 : j2Ay<0-7>, j2Bx<0-3>
      68-79 : j3Ay<0-7>, j3Bx<0-3>

    // Staging registers for global loads: one group of four for A and four
    // groups of four for B.
      80-83 : loadA<0-3>
      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>

    // Global pointers as low/high register pairs: one for A, four for B.
    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>

    // State that is still referenced after the setup block.
    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb
    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ

    // Names for the C output stage. They are not referenced in the code visible
    // in this file (presumably used by the included common file - confirm).
    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
    40-47 : c<0-3>, d3, d2, d1, d0
   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags

</REGISTER_MAPPING>

// Read the thread index and the three block indices. Each S2R names its own
// barrier number (1-4) in the third control field. The setup code waits for
// them with the masks 01 (tid), 02 (blkB), 04 (blkA) and 08 (blkZ) before the
// first use of each value.
--:-:1:-:1      S2R tid,  SR_TID.X;
--:-:2:-:1      S2R blkB, SR_CTAID.Z;
--:-:3:-:1      S2R blkA, SR_CTAID.Y;
--:-:4:-:1      S2R blkZ, SR_CTAID.X;

<SCHEDULE_BLOCK>
// One-time setup: copy scalar parameters into registers, zero registers 0-31,
// and derive the per-thread global pointers (trackA, track0B..track3B) and
// shared-memory offsets (writeAs, writeBs, readAs, readBs).

// ldb   = param_ldb8 >> 5
// ldb4  = ldb * 4   (step between tb0, tb1, tb2 and tb3 below)
// ldb16 = ldb * 64  (added to the low half of each track*B pointer per tile)
// NOTE(review): the shift by 5 presumably undoes a scaling applied by the
// host before launch - confirm against the calling code.
--:-:-:-:1      MOV k,    param_k;
--:-:-:-:1      MOV lda,  param_lda;
--:-:-:-:1      MOV ldb,  param_ldb8;
--:-:-:-:1      SHR.U32 ldb, ldb, 5;
--:-:-:-:1      MOV ldaz, param_ldaz;
--:-:-:-:1      MOV ldbz, param_ldbz;
--:-:-:-:1      SHL ldb4,  ldb, 2;
--:-:-:-:1      SHL ldb16, ldb, 6;

// Store RZ at addr_zero, then read it back with eight 128-bit loads into
// czero00, czero04, ... czero28. This zeroes registers 0-31, which the
// register map also names cx0y0 .. cx3y7.
--:-:-:-:1      STS.128 [addr_zero], RZ;
<CODE>
    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
</CODE>

// tidAX   = tid >> 2
// tidAY   = (tid & 3) << 2
// shiftAX = (tid & 3) << 3
01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
01:-:-:-:1      LOP.AND tid3,    tid,  3;
--:-:-:-:1      SHL     tidAY,   tid3, 2;
--:-:-:-:1      SHL     shiftAX, tid3, 3;

// tidBX = (tid & 31) << 2
// tidBY = (tid >> 5)
01:-:-:-:1      LOP.AND tidBX, tid,   31;
--:-:-:-:1      SHL     tidBX, tidBX, 2;
--:-:-:-:1      SHR.U32 tidBY, tid,   5;

// txa    = blkA*32 + tidAX
// ta     = txa * lda + tidAY + ldaz * blkZ
// trackA = param_A + ta * 4   (64-bit: low half sets CC, high half adds it)
04:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidAY, xmad_ta;
08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     2;
--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;

// txb = blkB*128 + tidBX
// tb0 = txb + ldb * tidBY + ldbz * blkZ
// tb1, tb2, tb3 = tb0 + ldb4, + 2*ldb4, + 3*ldb4
02:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
--:-:-:-:1      XMAD.LO2 tb0, ldb,  tidBY, txb;
08:-:-:-:1      XMAD.LO2 tb0, ldbz, blkZ,  tb0;
--:-:-:-:1      IADD     tb1, tb0, ldb4;
--:-:-:-:1      IADD     tb2, tb1, ldb4;
--:-:-:-:1      IADD     tb3, tb2, ldb4;

// track<i>B = param_B + tb<i> * 4, built as 64-bit pointers like trackA.
--:-:-:-:1      LEA      track0B0.CC, tb0, param_B[0],     2;
--:-:-:-:1      LEA.HI.X track0B1,    tb0, param_B[1], RZ, 2;
--:-:-:-:1      LEA      track1B0.CC, tb1, param_B[0],     2;
--:-:-:-:1      LEA.HI.X track1B1,    tb1, param_B[1], RZ, 2;
--:-:-:-:1      LEA      track2B0.CC, tb2, param_B[0],     2;
--:-:-:-:1      LEA.HI.X track2B1,    tb2, param_B[1], RZ, 2;
--:-:-:-:1      LEA      track3B0.CC, tb3, param_B[0],     2;
--:-:-:-:1      LEA.HI.X track3B1,    tb3, param_B[1], RZ, 2;

// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 + 4*(szShareA + szShareB)
--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;

// writeBs = (tidBY*128 + tidBX) * 4 + 4*(szShareA*2 + szShareB)
--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 7;
--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;

// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
// NOTE(review): the two LOP.AND lines below read tid without a 01 wait mask.
// That is safe only if an instruction carrying the 01 mask is issued before
// them - confirm the scheduler cannot hoist them above such an instruction.
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readAs, tid,    16;
--:-:-:-:1      SHR.U32 readAs, readAs, 3;
--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
--:-:-:-:1      SHL     readAs, readAs, 4;

// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4*szShareA
// NOTE(review): the BFE.U32 below also reads tid without a 01 wait mask.
01:-:-:-:1      LOP.AND tid96,  tid,    96;
--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;

// swapBuf = -4*(szShareA + szShareB). Later code adds or subtracts it from the
// read and write offsets and then negates it.
--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
</SCHEDULE_BLOCK>

// Bounds-checked load of one A tile and one B tile. Execution falls into this
// label after setup; the loop tail in the insert table at the end of this file
// also branches back here when P1 is set.
REMAINDER:

<SCHEDULE_BLOCK>

// k-indices of the three further B rows handled by this thread.
--:-:-:-:1      IADD tidBY1, tidBY, 4;
--:-:-:-:1      IADD tidBY2, tidBY, 8;
--:-:-:-:1      IADD tidBY3, tidBY, 12;

<CODE>
    # $vec is not assigned anywhere in the visible source. It selects between
    # the 128-bit load path (first string) and the per-element path (second).
    our $vec;
    return $vec ? q{
// Vector path. P5 = column in range (txb < n), P6 = row in range (txa < m).
--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;

// P0..P3: B row index below k and P5. P4: A k-index below k and P6.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY,  k, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY1, k, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY2, k, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY3, k, P5;
--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY,  k, P6;

// In-range lanes load 128 bits from global memory, each group on its own
// barrier (1-5).
<ORDERED>
--:-:1:-:1  @P0 LDG.E.CI.128 load0B, [track0B];
--:-:2:-:1  @P1 LDG.E.CI.128 load1B, [track1B];
--:-:3:-:1  @P2 LDG.E.CI.128 load2B, [track2B];
--:-:4:-:1  @P3 LDG.E.CI.128 load3B, [track3B];
--:-:5:-:1  @P4 LDG.E.CI.128 loadA,  [trackA];
</ORDERED>

// Out-of-range lanes read addr_zero instead, where the setup code stored RZ.
// All of these share barrier 6.
<ORDERED>
--:-:6:-:1 @!P0 LDS.U.128 load0B, [addr_zero];
--:-:6:-:1 @!P1 LDS.U.128 load1B, [addr_zero];
--:-:6:-:1 @!P2 LDS.U.128 load2B, [addr_zero];
--:-:6:-:1 @!P3 LDS.U.128 load3B, [addr_zero];
--:-:6:-:1 @!P4 LDS.U.128 loadA,  [addr_zero];
</ORDERED>

    } : q{

// Scalar path: every element is loaded and bounds-checked individually.
--:-:-:-:1      IADD tidAY1, tidAY, 1;
--:-:-:-:1      IADD tidAY2, tidAY, 2;
--:-:-:-:1      IADD tidAY3, tidAY, 3;

--:-:-:-:1      IADD txb1,  txb,  1;
--:-:-:-:1      IADD txb2,  txb,  2;
--:-:-:-:1      IADD txb3,  txb,  3;

// B row tidBY: P4 = row below k, P0..P3 = column txb..txb3 below n and P4.
// Elements that fail the check are set to RZ.
--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
<ORDERED>
--:-:1:-:1  @P0 LDG.E.CI load0B0, [track0B + 4x<0>];
--:-:1:-:1  @P1 LDG.E.CI load0B1, [track0B + 4x<1>];
--:-:1:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
--:-:1:-:1  @P3 LDG.E.CI load0B3, [track0B + 4x<3>];
</ORDERED>
--:-:-:-:1 @!P0 MOV load0B0, RZ;
--:-:-:-:1 @!P1 MOV load0B1, RZ;
--:-:-:-:1 @!P2 MOV load0B2, RZ;
--:-:-:-:1 @!P3 MOV load0B3, RZ;

// B row tidBY1, same scheme with P5 as the row check.
--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY1, k, PT;
--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P5;
<ORDERED>
--:-:2:-:1  @P0 LDG.E.CI load1B0, [track1B + 4x<0>];
--:-:2:-:1  @P1 LDG.E.CI load1B1, [track1B + 4x<1>];
--:-:2:-:1  @P2 LDG.E.CI load1B2, [track1B + 4x<2>];
--:-:2:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
</ORDERED>
--:-:-:-:1 @!P0 MOV load1B0, RZ;
--:-:-:-:1 @!P1 MOV load1B1, RZ;
--:-:-:-:1 @!P2 MOV load1B2, RZ;
--:-:-:-:1 @!P3 MOV load1B3, RZ;

// B row tidBY2, same scheme with P6 as the row check.
--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY2, k, PT;
--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
<ORDERED>
--:-:3:-:1  @P0 LDG.E.CI load2B0, [track2B + 4x<0>];
--:-:3:-:1  @P1 LDG.E.CI load2B1, [track2B + 4x<1>];
--:-:3:-:1  @P2 LDG.E.CI load2B2, [track2B + 4x<2>];
--:-:3:-:1  @P3 LDG.E.CI load2B3, [track2B + 4x<3>];
</ORDERED>
--:-:-:-:1 @!P0 MOV load2B0, RZ;
--:-:-:-:1 @!P1 MOV load2B1, RZ;
--:-:-:-:1 @!P2 MOV load2B2, RZ;
--:-:-:-:1 @!P3 MOV load2B3, RZ;

// B row tidBY3, same scheme; P4 is reused as the row check.
--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY3, k, PT;
--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
<ORDERED>
--:-:4:-:1  @P0 LDG.E.CI load3B0, [track3B + 4x<0>];
--:-:4:-:1  @P1 LDG.E.CI load3B1, [track3B + 4x<1>];
--:-:4:-:1  @P2 LDG.E.CI load3B2, [track3B + 4x<2>];
--:-:4:-:1  @P3 LDG.E.CI load3B3, [track3B + 4x<3>];
</ORDERED>
--:-:-:-:1 @!P0 MOV load3B0, RZ;
--:-:-:-:1 @!P1 MOV load3B1, RZ;
--:-:-:-:1 @!P2 MOV load3B2, RZ;
--:-:-:-:1 @!P3 MOV load3B3, RZ;

// P5 is recomputed as the column check consumed after this section.
// NOTE(review): it tests only txb, not txb1..txb3, yet the per-element loads
// issued later under P2, P3 and P5 also read offsets 1 to 3 - confirm this
// is intended when n is not a multiple of 4.
--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;

// A: P6 = row txa below m, P0..P3 = k-index tidAY..tidAY3 below k and P6.
--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P6;
--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P6;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P6;
<ORDERED>
--:-:5:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
--:-:5:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
--:-:5:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
</ORDERED>
--:-:-:-:1 @!P0 MOV loadA0, RZ;
--:-:-:-:1 @!P1 MOV loadA1, RZ;
--:-:-:-:1 @!P2 MOV loadA2, RZ;
--:-:-:-:1 @!P3 MOV loadA3, RZ;

    };
</CODE>

// Predicates for the next tile's loads: k >= 32 combined with the column
// check (P2, P3 and P5 all get the same value) or the row check (P6).
--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;

// bDoRemainder = k & 15 && k > 16
// P0 holds the (k & 15) != 0 part; P1 is the combined result tested by the
// branch back to REMAINDER.
--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P0;

</SCHEDULE_BLOCK>

// Store the loaded tile to shared memory, advance the global pointers, then
// swap buffers. The wait masks follow the same bit encoding as the 01/02/04/08
// masks in the setup code: 21 waits on barriers 1 and 6, 02 on 2, 04 on 3,
// 08 on 4 and 10 on 5.

// B: four 128-bit stores, placed 4*128 words apart. Each track*B pointer then
// advances by ldb16 as a 64-bit add (low half sets CC, IADD.X adds the carry).
21:-:-:-:1      STS.128 [writeBs + 4x<0*128>], load0B;
--:-:-:-:6      IADD   track0B0.CC, track0B0, ldb16;
--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;

02:-:-:-:1      STS.128 [writeBs + 4x<4*128>], load1B;
--:-:-:-:6      IADD   track1B0.CC, track1B0, ldb16;
--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;

04:-:-:-:1      STS.128 [writeBs + 4x<8*128>], load2B;
--:-:-:-:6      IADD   track2B0.CC, track2B0, ldb16;
--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;

08:-:-:-:1      STS.128 [writeBs + 4x<12*128>], load3B;
--:-:-:-:6      IADD   track3B0.CC, track3B0, ldb16;
--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;

// A: four single-word stores, 32 words apart. trackA advances by 16 words;
// only the low half is added here, and the matching IADD.X is the last
// instruction of this section. None of the IADDs in between carries a .CC
// suffix, so the carry is still intact there.
10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
--:-:-:-:0      IADD   trackA0.CC, trackA0, 4x<16>;
--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;

// Swap buffers: the read offsets move by -swapBuf and the write offsets by
// +swapBuf around the barrier, then swapBuf changes sign for the next swap.
--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;

// High half of the trackA advance started above.
--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;

// Issue the global loads for the following tile before entering the main
// loop. They are guarded by P2, P3, P5 and P6, which were set from k >= 32
// combined with the bounds checks, and use barriers 3, 4, 5 and 6. The
// load2B and load3B groups share barrier 5.
<CODE>
    # Same $vec switch as the guarded load above: 128-bit loads when set,
    # four single-word loads per group otherwise.
    our $vec;
    return $vec ? q{
--:-:3:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
--:-:4:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
--:-:5:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
--:-:6:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
    } : q{
--:-:3:-:1  @P2 LDG.E.CI load0B0, [track0B + 4x<0>];
--:-:3:-:1  @P2 LDG.E.CI load0B1, [track0B + 4x<1>];
--:-:3:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
--:-:3:-:1  @P2 LDG.E.CI load0B3, [track0B + 4x<3>];

--:-:4:-:1  @P3 LDG.E.CI load1B0, [track1B + 4x<0>];
--:-:4:-:1  @P3 LDG.E.CI load1B1, [track1B + 4x<1>];
--:-:4:-:1  @P3 LDG.E.CI load1B2, [track1B + 4x<2>];
--:-:4:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];

--:-:5:-:1  @P5 LDG.E.CI load2B0, [track2B + 4x<0>];
--:-:5:-:1  @P5 LDG.E.CI load2B1, [track2B + 4x<1>];
--:-:5:-:1  @P5 LDG.E.CI load2B2, [track2B + 4x<2>];
--:-:5:-:1  @P5 LDG.E.CI load2B3, [track2B + 4x<3>];

--:-:5:-:1  @P5 LDG.E.CI load3B0, [track3B + 4x<0>];
--:-:5:-:1  @P5 LDG.E.CI load3B1, [track3B + 4x<1>];
--:-:5:-:1  @P5 LDG.E.CI load3B2, [track3B + 4x<2>];
--:-:5:-:1  @P5 LDG.E.CI load3B3, [track3B + 4x<3>];

--:-:6:-:1  @P6 LDG.E.CI loadA0,  [trackA + 4x<0>];
--:-:6:-:1  @P6 LDG.E.CI loadA1,  [trackA + 4x<1>];
--:-:6:-:1  @P6 LDG.E.CI loadA2,  [trackA + 4x<2>];
--:-:6:-:1  @P6 LDG.E.CI loadA3,  [trackA + 4x<3>];
    };
</CODE>

// Defines the settings and the table of instructions for the main loop. This
// section emits no text itself (it returns an empty string).
// NOTE(review): the keys look like positions (j = step, c = slot) in the loop
// body generated by the included common file, and shiftAX / shiftBX look like
// switches read by that file - confirm there.
<CODE>
    our $vec;
    our $shiftAX = 1;
    our $shiftBX = 0;
    our %insert =
    (
        # Loop counter: consume 16 from k, then P0 = another full step remains.
        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",

        # Store the previously loaded tile to shared memory while P0 holds.
        # The wait masks 04/08/10/20 match the barriers used by the loads
        # further down (3, 4, 5, 6).
        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n",
        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n",
        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n",
        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n",
        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",

        # Advance each global pointer (64-bit add split into low and high
        # halves), guarded by the predicate that also guards its next load.
        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, ldb16;\n",
        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb16;\n",
        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
        j7c7   => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, ldb16;\n",
        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, ldb16;\n",
        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC,  trackA0, 4x<16>;\n",
        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,     trackA1, RZ;\n",

        # Narrow the load predicates: each stays set only while k >= 32.
        # P5 guards both the load2B and load3B groups, so it is updated once.
        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",

        # Barrier, then swap buffers: reads move by -swapBuf, writes by
        # +swapBuf, and swapBuf changes sign.
        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",

        # Global loads for the tile after next. The first load of each group
        # waits on the barrier named by the matching store above; the last
        # load of each group names the barrier that the next store waits on.
        ($vec ?
            (
                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0B, [track0B];\n",
                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1B, [track1B];\n",
                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.128 load2B, [track2B];\n",
                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3B, [track3B];\n",
                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadA,  [trackA];\n",
            ) :
            (

                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n",
                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n",
                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n",
                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n",

                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n",
                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n",
                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n",
                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n",

                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI load2B0, [track2B + 4x<0>];\n",
                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI load2B1, [track2B + 4x<1>];\n",
                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI load2B2, [track2B + 4x<2>];\n",
                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI load2B3, [track2B + 4x<3>];\n",

                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n",
                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n",
                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n",
                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n",

                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
            )
        ),

        # Loop tail: repeat while P0 holds; otherwise, if P1 (bDoRemainder) is
        # set, go back to REMAINDER for the bounds-checked tail.
        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
    );
    return '';
</CODE>

// Pull in the shared 32x128 code. The LOOP label targeted by the branch in the
// insert table is not defined in this file.
// NOTE(review): presumably the included file defines LOOP, consumes the insert
// table and writes C - confirm.
<INCLUDE file="sgemm_common_32x128.sass"/>
